Analiza pokazała, że
Analiza zbioru danych pozwoliła na powiązanie rozmiaru poławianych śledzi z atrybutami opisanymi w tym zbiorze. Znaczna część atrybutów jest ze sobą silnie skorelowana. Na podstawie wykorzystanych w badaniu danych można wywnioskować, że realny wpływ na długość poławianych śledzi mają między innymi: 1. temperatura przy powierzchni wody [°C]; 2. dostępność planktonu [zagęszczenie Calanus finmarchicus gat. 2]; 3. oscylacja północnoatlantycka [mb].
library(corrplot)
## corrplot 0.84 loaded
library(DAAG)
## Loading required package: lattice
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Seed the random number generator so that repeated runs give the same results.
set.seed(5)

# Read the input CSV; cells containing "?" are treated as missing (NA).
filename <- "sledzie.csv"
df <- read.csv(file = filename, na.strings = "?")

# Sanity check: confirm that the loaded object is a data frame.
print("Is data.frame instance?")
## [1] "Is data.frame instance?"
print(inherits(df, "data.frame"))
## [1] TRUE
# Drop the column named "X". `drop = FALSE` keeps the result a data frame
# even when only a single column is left after the removal.
df <- df[, names(df) != "X", drop = FALSE]

# Replace every missing value with the mean of its own column.
# `df[] <- lapply(...)` works column by column and keeps the data-frame
# structure, so (unlike building a matrix with sapply) a one-row data frame is
# not collapsed and one non-numeric column cannot coerce all other columns.
# Non-numeric columns are left untouched because a mean is undefined for them.
df[] <- lapply(df, function(column) {
  if (is.numeric(column)) {
    column[is.na(column)] <- mean(column, na.rm = TRUE)
  }
  column
})
# Report how many rows the cleaned data frame has, followed by the
# per-column summary statistics.
print("Size of cleared data:")
## [1] "Size of cleared data:"
print(dim(df)[1L])
## [1] 52582
print(summary(object = df))
## length cfin1 cfin2 chel1
## Min. :19.0 Min. : 0.0000 Min. : 0.0000 Min. : 0.000
## 1st Qu.:24.0 1st Qu.: 0.0000 1st Qu.: 0.2778 1st Qu.: 2.469
## Median :25.5 Median : 0.1333 Median : 0.7012 Median : 6.083
## Mean :25.3 Mean : 0.4458 Mean : 2.0248 Mean :10.006
## 3rd Qu.:26.5 3rd Qu.: 0.3603 3rd Qu.: 1.9973 3rd Qu.:11.500
## Max. :32.5 Max. :37.6667 Max. :19.3958 Max. :75.000
## chel2 lcop1 lcop2 fbar
## Min. : 5.238 Min. : 0.3074 Min. : 7.849 Min. :0.0680
## 1st Qu.:13.589 1st Qu.: 2.5479 1st Qu.:17.808 1st Qu.:0.2270
## Median :21.435 Median : 7.1229 Median :25.338 Median :0.3320
## Mean :21.221 Mean : 12.8108 Mean :28.419 Mean :0.3304
## 3rd Qu.:27.193 3rd Qu.: 21.2315 3rd Qu.:37.232 3rd Qu.:0.4560
## Max. :57.706 Max. :115.5833 Max. :68.736 Max. :0.8490
## recr cumf totaln sst
## Min. : 140515 Min. :0.06833 Min. : 144137 Min. :12.77
## 1st Qu.: 360061 1st Qu.:0.14809 1st Qu.: 306068 1st Qu.:13.63
## Median : 421391 Median :0.23191 Median : 539558 Median :13.86
## Mean : 520367 Mean :0.22981 Mean : 514973 Mean :13.87
## 3rd Qu.: 724151 3rd Qu.:0.29803 3rd Qu.: 730351 3rd Qu.:14.16
## Max. :1565890 Max. :0.39801 Max. :1015595 Max. :14.73
## sal xmonth nao
## Min. :35.40 Min. : 1.000 Min. :-4.89000
## 1st Qu.:35.51 1st Qu.: 5.000 1st Qu.:-1.89000
## Median :35.51 Median : 8.000 Median : 0.20000
## Mean :35.51 Mean : 7.258 Mean :-0.09236
## 3rd Qu.:35.52 3rd Qu.: 9.000 3rd Qu.: 1.63000
## Max. :35.61 Max. :12.000 Max. : 5.08000
# Draw one kernel density plot per column, titled with the column name.
for (name in names(df)) {
  d <- density(df[[name]])
  plot(d, main = name)
}
# Pairwise correlations between all columns, drawn as a pie-style
# correlation plot in black/white on a light blue background.
corr_matrix <- cor(x = df)
corrplot(
  corr_matrix,
  method = "pie",
  col = c("black", "white"),
  bg = "lightblue"
)
# Number of consecutive rows that are averaged into one point of the timeline.
number_to_aggregate <- 100

# Chunk index per row: rows 1..100 -> 1, rows 101..200 -> 2, and so on; the
# last chunk may hold fewer rows. Integer division states the grouping
# directly instead of relying on rep() with a partially matched `len`
# argument.
chunk_id <- (seq_len(nrow(df)) - 1L) %/% number_to_aggregate + 1L

# Average every column within each chunk; `[-1]` removes the grouping column
# that aggregate() puts first.
aggregated <- aggregate(df, by = list(chunk_id), FUN = mean)[-1]

# Use the chunk position (taken from the row names) as the x-axis value.
# NOTE(review): this only represents time if the rows of `df` are in
# chronological order - confirm against the data source.
aggregated$timeline <- as.numeric(row.names(aggregated))
# Chunk-averaged length along the timeline; point size encodes averaged sst.
p <- ggplot(aggregated, aes(x = timeline, y = length, size = sst)) +
  geom_point() +
  theme_bw()
plotly::ggplotly(p)

# Chunk-averaged sst along the timeline; point size encodes averaged length.
p_sst <- ggplot(aggregated, aes(x = timeline, y = sst, size = length)) +
  geom_point() +
  theme_bw()
plotly::ggplotly(p_sst)
# Choice of predictors for the regression:
#   - sst:   has the strongest correlation with length
#   - cfin2: strongly correlated with length, relatively weakly with sst
#   - nao:   correlated with length, but not strongly with sst or cfin2
# The remaining attributes are highly correlated with one another, so adding
# more of them to the regression is not needed.
formula <- length ~ sst + cfin2 + nao

# Fit the linear regression model on the full data set and show its summary.
linearMod <- lm(formula = formula, data = df)
summary(linearMod)
##
## Call:
## lm(formula = formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3744 -0.9972 0.0312 1.0028 6.7078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.8584067 0.2511569 190.552 <2e-16 ***
## sst -1.6256714 0.0180124 -90.253 <2e-16 ***
## cfin2 -0.0007262 0.0018283 -0.397 0.691
## nao -0.0331445 0.0033535 -9.884 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.479 on 52578 degrees of freedom
## Multiple R-squared: 0.1996, Adjusted R-squared: 0.1996
## F-statistic: 4371 on 3 and 52578 DF, p-value: < 2.2e-16
# Cross-validate the same regression formula with CVlm (m = 5), with
# warnings suppressed and CVlm's printed output turned off.
cvResults <- suppressWarnings(
  CVlm(
    df,
    form.lm = formula,
    m = 5,
    dots = FALSE,
    seed = 29,
    legend.pos = "topleft",
    printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

# Read the "ms" attribute attached to the cross-validation result.
attr(cvResults, "ms")
## [1] 2.186963